Chapter 3 Data statistics
3.1 Sequencing reads statistics
sample_metadata %>%
summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2),
mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
tt()| Total | Average |
|---|---|
| 398.22 | 4.33 ± 1.45 |
3.2 DNA fractions
sequence_fractions <- read_counts %>%
pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
group_by(sample) %>%
summarise(mags = sum(value)) %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
dplyr::select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
mutate(mags_bases = mags*146) %>%
mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
dplyr::select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)
sequence_fractions %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
tt()| Sample | Low quality | Mapped to host | Unmapped | Mapped to MAGs |
|---|---|---|---|---|
| EHI01436 | 0.10323450 | 0.002155664 | 1.9223563 | 0.14057376 |
| EHI01437 | 0.14688974 | 0.001776019 | 1.3951997 | 3.75777530 |
| EHI01438 | 0.11341031 | 0.001443667 | 1.0781908 | 2.93324556 |
| EHI01439 | 0.09710566 | 0.000553848 | 0.7633965 | 2.39854085 |
| EHI01440 | 0.08161119 | 0.000412702 | 0.6588022 | 2.80294070 |
| EHI01441 | 0.09842650 | 0.001532668 | 0.9259464 | 2.26467053 |
| EHI01442 | 0.09044909 | 0.003948668 | 1.3099571 | 1.91223033 |
| EHI01443 | 0.08349080 | 0.001670709 | 0.7703155 | 1.97118600 |
| EHI01444 | 0.15356698 | 0.008828658 | 1.8761546 | 2.25276496 |
| EHI01445 | 0.24321310 | 0.000493809 | 0.6808145 | 2.86800064 |
| EHI01447 | 0.07308100 | 0.000928745 | 0.5335369 | 1.86430335 |
| EHI01448 | 0.16654049 | 0.001447619 | 1.2343569 | 4.00302946 |
| EHI01449 | 0.08086772 | 0.000712056 | 1.9015472 | 0.56772173 |
| EHI01450 | 0.09513485 | 0.000808493 | 0.6801877 | 2.40271908 |
| EHI01451 | 0.15486528 | 0.006054971 | 1.2002374 | 3.09457074 |
| EHI01584 | 0.31311426 | 0.001124632 | 1.1945509 | 4.46459065 |
| EHI01585 | 0.43461750 | 0.010705116 | 4.7173126 | 5.56319787 |
| EHI01610 | 0.09609182 | 0.002231717 | 0.7319288 | 2.22137350 |
| EHI01611 | 0.12854248 | 0.002113926 | 2.0768251 | 1.11056915 |
| EHI01612 | 0.09445979 | 0.000396947 | 0.8434826 | 2.27477884 |
| EHI01613 | 0.11771235 | 0.003535185 | 0.8236243 | 2.34411760 |
| EHI01616 | 0.13311594 | 0.000846441 | 1.0689190 | 2.28105071 |
| EHI01617 | 0.10496582 | 0.000600034 | 0.7624687 | 1.66612338 |
| EHI01618 | 0.15796519 | 0.001938639 | 1.4700412 | 1.45306500 |
| EHI01619 | 0.21423836 | 0.001610432 | 2.1137921 | 2.57152267 |
| EHI01620 | 0.15125219 | 0.002882104 | 1.5452853 | 1.20720363 |
| EHI01621 | 0.14437434 | 0.000908440 | 0.8938030 | 2.84473642 |
| EHI01623 | 0.13905131 | 0.000111578 | 0.8158237 | 2.89356495 |
| EHI01625 | 0.13457413 | 0.004361014 | 0.9973074 | 3.14405452 |
| EHI01627 | 0.11510846 | 0.000652453 | 0.8491846 | 2.40277675 |
| EHI01628 | 0.13073679 | 0.002521283 | 1.2403353 | 1.86084796 |
| EHI01631 | 0.09932800 | 0.001705834 | 0.8044096 | 1.77531722 |
| EHI02085 | 0.57919563 | 0.018462021 | 2.6410532 | 1.29059649 |
| EHI02095 | 0.48834211 | 0.013513952 | 3.4049831 | 3.18378200 |
| EHI02104 | 0.53195157 | 0.007010599 | 2.7354830 | 0.81316072 |
| EHI02109 | 1.28374828 | 0.008847543 | 4.9223937 | 0.24490536 |
| EHI02115 | 0.92901178 | 0.003428911 | 3.9141515 | 0.19065498 |
| EHI02128 | 0.96743572 | 0.007418441 | 4.7627289 | 0.16898799 |
| EHI02571 | 0.13287769 | 0.000830065 | 1.3326271 | 2.42337662 |
| EHI02572 | 0.23097718 | 0.001104376 | 4.2180228 | 0.66197962 |
| EHI02573 | 0.23772138 | 0.000919831 | 1.4201215 | 2.90177102 |
| EHI02574 | 0.18641515 | 0.001105944 | 1.2909807 | 2.51445595 |
| EHI02575 | 0.16800006 | 0.000451116 | 0.7701716 | 3.10908854 |
| EHI02576 | 0.27054446 | 0.001466596 | 1.2373298 | 4.37179582 |
| EHI02577 | 0.13875010 | 0.000159264 | 0.6656173 | 3.11920576 |
| EHI02578 | 0.18840997 | 0.000370817 | 0.9281657 | 3.79342865 |
| EHI02579 | 0.17040671 | 0.000316117 | 1.1304582 | 2.89912711 |
| EHI02580 | 0.14585212 | 0.002285096 | 1.1005491 | 2.55888783 |
| EHI02581 | 0.31534501 | 0.004222759 | 1.1406810 | 3.45948869 |
| EHI02583 | 0.19024490 | 0.002688647 | 2.5186672 | 1.41498835 |
| EHI02586 | 0.16808497 | 0.000634686 | 1.2191413 | 2.86012073 |
| EHI02588 | 0.22724830 | 0.000620576 | 1.3616533 | 3.96253913 |
| EHI02589 | 0.17868765 | 0.001172553 | 1.3032228 | 2.88262020 |
| EHI02590 | NA | NA | NA | 0.01003195 |
| EHI02591 | 0.24672078 | 0.005870858 | 5.1327559 | 0.11609000 |
| EHI02593 | 0.28535188 | 0.001047289 | 1.7734727 | 3.51223396 |
| EHI02594 | 0.14799113 | 0.001254245 | 2.2455458 | 0.78283346 |
| EHI02595 | 0.17594927 | 0.000390443 | 0.8947388 | 2.65116816 |
| EHI02596 | 0.19552510 | 0.000402096 | 0.8853602 | 3.90930111 |
| EHI02597 | 0.16997179 | 0.003302624 | 1.8718822 | 2.27029869 |
| EHI02599 | 0.24535794 | 0.002994709 | 1.6290468 | 3.25617683 |
| EHI02600 | 0.16539330 | 0.000372951 | 2.5885522 | 0.78914504 |
| EHI02601 | 0.20440033 | 0.000546487 | 1.2005389 | 3.29478799 |
| EHI02604 | 0.13350290 | 0.000033019 | 0.5459000 | 2.24831357 |
| EHI02605 | 0.23336073 | 0.000578456 | 0.7867636 | 4.00240210 |
| EHI02606 | 0.21269906 | 0.000840830 | 0.9069484 | 4.52838330 |
| EHI02608 | 0.16608092 | 0.000868246 | 1.2007537 | 2.21539480 |
| EHI02609 | 0.18162303 | 0.000620091 | 1.2032065 | 3.45803993 |
| EHI02610 | 0.18029349 | 0.000359149 | 1.0625221 | 3.04269811 |
| EHI02611 | 0.14935861 | 0.000280047 | 0.9678059 | 2.31031466 |
| EHI02613 | 0.13705908 | 0.001431597 | 0.6736314 | 2.75385288 |
| EHI02614 | 0.18096790 | 0.000783821 | 1.1284081 | 2.27501989 |
| EHI02616 | 0.14713007 | 0.000239190 | 0.5108695 | 2.35465880 |
| EHI02618 | 0.17147141 | 0.000675857 | 0.9085123 | 2.63753628 |
| EHI02620 | 0.15234767 | 0.000669926 | 0.8504433 | 2.26631084 |
| EHI02621 | 0.18176981 | 0.000880545 | 2.0099358 | 2.33263631 |
| EHI02622 | 0.20546598 | 0.000285185 | 1.1929782 | 3.39019490 |
| EHI02623 | 0.14304619 | 0.000230072 | 0.5719310 | 2.72557253 |
| EHI02626 | 0.20016240 | 0.000600853 | 0.9178175 | 2.61054132 |
| EHI02627 | 0.16663385 | 0.001572916 | 2.4785902 | 0.65110510 |
| EHI02628 | 0.14066292 | 0.000620136 | 0.5260281 | 1.86407529 |
| EHI02629 | 0.15442873 | 0.001035638 | 1.3085973 | 2.62663840 |
| EHI02631 | 0.28751065 | 0.003383238 | 4.2392404 | 1.34326512 |
| EHI02634 | 0.27934868 | 0.001091891 | 2.4750414 | 3.24088595 |
| EHI02635 | 0.30611130 | 0.001240215 | 1.3218018 | 4.73514062 |
| EHI02636 | 0.33858819 | 0.000408731 | 1.2870087 | 6.26143207 |
| EHI02637 | 0.25767107 | 0.000313024 | 1.3477516 | 5.69477117 |
| EHI02638 | 0.26777696 | 0.001869868 | 1.9786471 | 3.30618562 |
| EHI02640 | 0.20192954 | 0.002568987 | 1.1840044 | 3.29878050 |
| EHI02641 | 0.53193661 | 0.006973676 | 3.8010965 | 4.56692336 |
| EHI02642 | 0.35162886 | 0.000765507 | 5.4173877 | 1.45875812 |
| EHI02643 | 0.33426863 | 0.001865962 | 5.7701855 | 1.11141741 |
| EHI02646 | 0.35059944 | 0.008032777 | 6.5018324 | 1.15256692 |
sequence_fractions %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
tt()| Low quality | Mapped to host | Unmapped | Mapped to MAGs |
|---|---|---|---|
| 0.2261139 | 0.002232012 | 1.70865 | 2.507462 |
sequence_fractions %>%
pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
mutate(value = value / 1000000000) %>%
mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
left_join(sample_metadata, by = join_by(sample == sample))%>%
ggplot(., aes(x = sample, y = value, fill=fraction)) +
geom_bar(position="stack", stat = "identity") +
scale_fill_manual(name="Sequence type",
breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
facet_grid(~region, scales="free")+
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")3.3 Recovered microbial fraction
singlem_table <- sequence_fractions %>%
mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
dplyr::select(sample,mags_proportion,singlem_proportion) %>%
mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify
singlem_table %>%
pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
ggplot(., aes(x = value, y = sample, color=proportion)) +
geom_line(aes(group = sample), color = "#f8a538") +
geom_point() +
scale_color_manual(name="Proportion",
breaks=c("mags_proportion","singlem_proportion"),
labels=c("Recovered","Estimated"),
values=c("#52e1e8", "#876b53"))+
theme_classic() +
labs(y = "Samples", x = "Prokaryotic fraction (%)") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")damr <- singlem_table %>%
mutate(damr=ifelse(is.na(singlem_proportion),100,mags_proportion/singlem_proportion*100)) %>%
dplyr::select(sample,damr)
damr %>%
tt()| sample | damr |
|---|---|
| EHI01436 | 14.498616 |
| EHI01437 | 99.116488 |
| EHI01438 | 98.837524 |
| EHI01439 | 100.000000 |
| EHI01440 | 100.000000 |
| EHI01441 | 100.000000 |
| EHI01442 | 95.602448 |
| EHI01443 | 100.000000 |
| EHI01444 | 95.853830 |
| EHI01445 | 100.000000 |
| EHI01447 | 94.747746 |
| EHI01448 | 100.000000 |
| EHI01449 | 39.258880 |
| EHI01450 | 100.000000 |
| EHI01451 | 100.000000 |
| EHI01584 | 100.000000 |
| EHI01585 | 100.000000 |
| EHI01610 | 100.000000 |
| EHI01611 | 59.100933 |
| EHI01612 | 100.000000 |
| EHI01613 | 100.000000 |
| EHI01616 | 100.000000 |
| EHI01617 | 100.000000 |
| EHI01618 | 100.000000 |
| EHI01619 | 100.000000 |
| EHI01620 | 100.000000 |
| EHI01621 | 100.000000 |
| EHI01623 | 100.000000 |
| EHI01625 | 100.000000 |
| EHI01627 | 100.000000 |
| EHI01628 | 100.000000 |
| EHI01631 | 100.000000 |
| EHI02085 | 100.000000 |
| EHI02095 | 100.000000 |
| EHI02104 | 46.583977 |
| EHI02109 | 12.599681 |
| EHI02115 | 8.541973 |
| EHI02128 | 8.243211 |
| EHI02571 | 100.000000 |
| EHI02572 | 24.180328 |
| EHI02573 | 100.000000 |
| EHI02574 | 85.807038 |
| EHI02575 | 100.000000 |
| EHI02576 | 100.000000 |
| EHI02577 | 100.000000 |
| EHI02578 | 100.000000 |
| EHI02579 | 100.000000 |
| EHI02580 | 100.000000 |
| EHI02581 | 100.000000 |
| EHI02583 | 64.485479 |
| EHI02586 | 100.000000 |
| EHI02588 | 100.000000 |
| EHI02589 | 100.000000 |
| EHI02590 | 100.000000 |
| EHI02591 | 79.211470 |
| EHI02593 | 100.000000 |
| EHI02594 | 48.390116 |
| EHI02595 | 100.000000 |
| EHI02596 | 100.000000 |
| EHI02597 | 100.000000 |
| EHI02599 | 100.000000 |
| EHI02600 | 41.916383 |
| EHI02601 | 100.000000 |
| EHI02604 | 100.000000 |
| EHI02605 | 100.000000 |
| EHI02606 | 100.000000 |
| EHI02608 | 100.000000 |
| EHI02609 | 100.000000 |
| EHI02610 | 100.000000 |
| EHI02611 | 100.000000 |
| EHI02613 | 100.000000 |
| EHI02614 | 100.000000 |
| EHI02616 | 100.000000 |
| EHI02618 | 100.000000 |
| EHI02620 | 100.000000 |
| EHI02621 | 100.000000 |
| EHI02622 | 100.000000 |
| EHI02623 | 100.000000 |
| EHI02626 | 100.000000 |
| EHI02627 | 38.053421 |
| EHI02628 | 100.000000 |
| EHI02629 | 100.000000 |
| EHI02631 | 42.964286 |
| EHI02634 | 100.000000 |
| EHI02635 | 100.000000 |
| EHI02636 | 100.000000 |
| EHI02637 | 100.000000 |
| EHI02638 | 93.150685 |
| EHI02640 | 100.000000 |
| EHI02641 | 98.715862 |
| EHI02642 | 36.752729 |
| EHI02643 | 49.328039 |
| EHI02646 | 28.060369 |
[1] 73
| mean | sd |
|---|---|
| 88.21507 | 25.46559 |
3.3.1 Samples discarded (damr < 80)
samples_lowdamr <- damr %>%
filter(damr<80) %>%
select(sample)
sample_metadata[sample_metadata$sample %in% samples_lowdamr$sample,] %>% count(region, type, season, sort = TRUE)# A tibble: 8 × 4
region type season n
<chr> <chr> <chr> <int>
1 Irun natural autumn 6
2 Aia protected autumn 3
3 Orio urban autumn 2
4 Usurbil natural spring 2
5 Usurbil natural autumn 1
6 Villabona natural autumn 1
7 Zarautz protected autumn 1
8 Zarautz protected spring 1
sample_metadata[!sample_metadata$sample %in% samples_lowdamr$sample,] %>% count(region, type, season,sort = TRUE)
sample_metadata_filtdamr <- sample_metadata[!sample_metadata$sample %in% samples_lowdamr$sample,]
genome_counts_filtdamr <- genome_counts_filt %>%
select(c("genome",sample_metadata_filtdamr$sample))%>%
filter(rowSums(across(where(is.numeric)))!=0)
genome_metadata_filtdamr <- genome_metadata %>%
semi_join(., genome_counts_filtdamr, by = "genome") %>%
arrange(match(genome,genome_counts_filtdamr$genome))
genome_metadata_removed <- genome_metadata %>%
anti_join(., genome_counts_filtdamr, by = "genome") %>%
arrange(match(genome,genome_counts_filtdamr$genome))
genome_tree_filtdamr <- keep.tip(genome_tree, tip=genome_metadata_filtdamr$genome)
genome_counts_filt_func <- genome_counts_filt[genome_counts_filt$genome %in% rownames(genome_gifts),]
genome_gifts_filtdamr <- genome_gifts[rownames(genome_gifts) %in% genome_counts_filtdamr$genome,]
genome_gifts_filtdamr <- genome_gifts_filtdamr[, colSums(genome_gifts_filtdamr != 0) > 0]
phylum_colors_filtdamr <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata_filtdamr, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree_filtdamr$tip.label)) %>%
select(phylum, colors) %>%
unique() %>%
arrange(phylum) %>%
pull(colors, name=phylum)